Lab 1a. Species Distribution Modeling - Exploratory Data Analysis

2.1 Install Packages

# load packages, installing if missing
if (!require(librarian)){
  install.packages("librarian")
  library(librarian)
} 
## Loading required package: librarian
librarian::shelf(
  dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr, geojsonio, tidyverse)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
## Warning: multiple methods tables found for 'crop'
## Warning: multiple methods tables found for 'extend'
select <- dplyr::select # overwrite raster::select

# set random seed for reproducibility
set.seed(42)

# directory to store data
dir_data <- here("data/sdm") 
dir.create(dir_data, showWarnings = F)

2.2 Choose a Species

I have chosen the giraffe, Giraffa, as my species for this lab. I have always loved large mammals and giraffes are just especially majestic and interesting animals.

2.3 Get Species Observations

obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo    <- TRUE

if (!file.exists(obs_geo) | redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
    query = 'Giraffa',
    from = 'gbif', has_coords = T, limit = 10000))
  
  # extract data frame from result
  #df <- res$gbif$data[[1]]  # from Ben 
  df <- res$gbif$data[[1]] %>% 
    filter(longitude > -26.41,
           longitude < 54.98,
           latitude > -36.21,
           latitude < 38.65) # filter for just the African continent 
  readr::write_csv(df, obs_csv)
  
  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9379
# show points on map
mapview::mapview(obs, map.types = "Stamen.Terrain")

Question 1. How many observations total are in GBIF for your species? (Hint: ?occ) There are 9,225 giraffe observations in GBIF, filtering for observations on the African continent only. Before filtering, there were 9,300 observations total in GBIF.

Question 2. Do you see any odd observations, like marine species on land or vice versa? If so, please see the Data Cleaning and explain what you did to fix or remove these points. There are no odd observations, and all of the points are on land; however, I did decide to filter my observations to those appearing only on the African continent. The observations were most abundant here to start with anyways, and since they are native to Africa I felt that it was a good idea to build the species distribution model based on their native habitat. To filter the observations, I included observations that fell within a bounding box of the African continent.

2.4 Get Environmental Data

2.4.1 Presence

dir_env <- file.path(dir_data, "env")

# set a default data directory
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_climaticMoistureIndex", "ER_topoWet")

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)

Question: What environmental layers did you choose as predictors? Can you find any support for these in the literature? I decided to use the following layers as predictors: WC_alt, WC_bio1, WC_bio2, ER_topoWet, and ER_climaticMoistureIndex. Because giraffes are typically found at low altitudes, in arid environments like grasslands and savannas, I thought the climatic moisture index and altitude layer were important to include. Additionally, I wanted to include WC_bio1 and WC_bio2 to see if temperature played a role in determining the giraffe habitat since they prefer to live in warmer envrieonments. Lastly, I included the ER_topoWet layer because giraffes typically prefer dry climates and I thought this layer would be a good predictor.

obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
## Warning in CPL_write_ogr(obj, dsn, layer, driver,
## as.character(dataset_options), : GDAL Error 6: DeleteLayer() not supported by
## this dataset.
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs, obs_hull))
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite=T)  
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

2.4.2 Pseudo-Absence

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask for 
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
  
  # generate random points inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn=T)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
## Rows: 18758 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): ID, present, WC_alt, WC_bio1, WC_bio2, ER_climaticMoistureIndex, ER...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = F,
    options = list(
      dom = "t",
      pageLength = 20))

2.5 Term Plots

pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))
## Warning: Removed 163 rows containing non-finite values (stat_density).

Lab 1b. Species Distribution Modeling - Logistic Regression

1 Explore (cont’d)

librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)

dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 18758
datatable(pts_env, rownames = F)
GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 31 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 33 rows containing missing values
## Warning: Removed 31 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).

## Warning: Removed 35 rows containing missing values (geom_point).

## Warning: Removed 35 rows containing missing values (geom_point).

## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 31 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 31 rows containing missing values
## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).

## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 31 rows containing missing values (geom_point).

2 Logistic Regression

2.1 Setup data

# setup model data
d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 18723

2.2 Linear Model

# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9515 -0.3238  0.1843  0.3135  1.2728 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.460e-01  4.258e-02  -8.126 4.73e-16 ***
## WC_alt                    2.153e-04  9.853e-06  21.847  < 2e-16 ***
## WC_bio1                   3.160e-02  1.423e-03  22.203  < 2e-16 ***
## WC_bio2                  -1.154e-04  2.009e-03  -0.057    0.954    
## ER_climaticMoistureIndex -2.522e-01  1.591e-02 -15.855  < 2e-16 ***
## ER_topoWet               -4.112e-02  2.790e-03 -14.736  < 2e-16 ***
## lon                       1.261e-02  2.770e-04  45.522  < 2e-16 ***
## lat                      -1.151e-02  2.216e-04 -51.920  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4087 on 18715 degrees of freedom
## Multiple R-squared:  0.3321, Adjusted R-squared:  0.3318 
## F-statistic:  1329 on 7 and 18715 DF,  p-value: < 2.2e-16
y_predict <- predict(mdl, d, type="response")
y_true    <- d$present

range(y_predict)
## [1] -0.4698451  0.9514745
range(y_true)
## [1] 0 1

2.3 Generalized Linear Model

# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3128  -0.6827  -0.0385   0.8209   3.2534  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.444e+00  3.355e-01 -25.171   <2e-16 ***
## WC_alt                    1.705e-03  7.307e-05  23.333   <2e-16 ***
## WC_bio1                   3.259e-01  1.337e-02  24.383   <2e-16 ***
## WC_bio2                  -6.694e-03  1.221e-02  -0.548    0.583    
## ER_climaticMoistureIndex -1.572e+00  1.043e-01 -15.073   <2e-16 ***
## ER_topoWet               -2.909e-01  1.773e-02 -16.411   <2e-16 ***
## lon                       8.396e-02  2.089e-03  40.189   <2e-16 ***
## lat                      -8.174e-02  2.032e-03 -40.223   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25956  on 18722  degrees of freedom
## Residual deviance: 18280  on 18715  degrees of freedom
## AIC: 18296
## 
## Number of Fisher Scoring iterations: 5
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 0.0004095163 0.9310590901
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim = "free")

2.4 Generalized Additive Model

librarian::shelf(mgcv)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_climaticMoistureIndex) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_climaticMoistureIndex) + 
##     s(ER_topoWet) + s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.6404     0.1617  -10.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                               edf Ref.df Chi.sq p-value    
## s(WC_alt)                   8.267  8.828  223.2  <2e-16 ***
## s(WC_bio1)                  8.894  8.988  378.6  <2e-16 ***
## s(WC_bio2)                  8.787  8.981  275.2  <2e-16 ***
## s(ER_climaticMoistureIndex) 6.951  7.638  630.1  <2e-16 ***
## s(ER_topoWet)               8.822  8.988  184.1  <2e-16 ***
## s(lon)                      8.995  9.000  492.9  <2e-16 ***
## s(lat)                      8.817  8.985 1243.6  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.714   Deviance explained = 65.5%
## UBRE = -0.51533  Scale est. = 1         n = 18723
# show term plots
plot(mdl, scale=0)

Question: Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?

Altitude above about 2000, Bio_1 above about 26, Bio_2 above about 17, ER_topo_Wet under 9, and latitude from about -40 to -20 seem to contribute most towards presence. On the flip side, Bio_1 from about 0-20, ER_climaticMoistureIndex from -1 and above, ER_topoWet above 14, and longitude froom about -7 to 0 seem to contribute the most to absense.

2.5 Maxent (Maximum Entropy)

# load extra packages
librarian::shelf(
  maptools, sf)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of maxent
if (!interactive())
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
  mdl <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl)

# plot term plots
response(mdl)

Question: Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?

ER_climaticMoistureIndex and WC_bio1 seem to contribute the most towards giraffe presence in the maxent model. This is different from the GAM results because while it seemed like ER_climaticMoistureIndex did influence presence, it was not as obvious how much it contributed to presence. In the term plots above we can see that giraffe presence is heavily influenced by ER_climaticMoistureIndex hovering right around 0. Additionally, WC_bio1 heavily contributed to giraffe presence for values around 20, peaking around 21.

# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

# Lab 1c. Species Distribution Modeling - Decision Trees

1. Setup

# global knitr chunk options
knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE)

# load packages
librarian::shelf(
  caret,       # m: modeling framework
  dplyr, ggplot2 ,here, readr, 
  pdp,         # X: partial dependence plots
  rpart,       # m: recursive partition modeling
  rpart.plot,  # m: recursive partition plotting
  rsample,     # d: split train/test data
  skimr,       # d: skim summarize data table
  vip)         # X: variable importance
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# options
options(
  scipen = 999,
  readr.show_col_types = F)
set.seed(42)

# graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# paths
dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>% 
  select(-ID) %>%                   # not used as a predictor x
  mutate(
    present = factor(present)) %>%  # categorical response
  na.omit()                         # drop rows with NA
skim(d)
Data summary
Name d
Number of rows 18723
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9377, 1: 9346

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 815.31 515.32 -415.00 371.00 734.00 1209.00 3463.00 ▅▇▅▁▁
WC_bio1 0 1 22.84 3.46 5.90 21.00 22.20 25.20 30.60 ▁▁▃▇▃
WC_bio2 0 1 13.95 2.27 5.50 12.60 14.00 15.80 19.60 ▁▂▇▇▂
ER_climaticMoistureIndex 0 1 -0.58 0.29 -1.00 -0.79 -0.61 -0.34 0.60 ▇▆▅▁▁
ER_topoWet 0 1 12.36 1.39 7.30 11.27 12.43 13.23 15.82 ▁▂▇▇▂
lon 0 1 24.24 13.03 -16.54 16.29 29.12 34.30 40.58 ▁▂▂▃▇
lat 0 1 -0.47 16.42 -34.16 -13.18 -0.62 11.96 38.04 ▃▃▇▃▂

1.1 Split data into training and testing

# create training set with 80% of full data
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)

# show number of rows present is 0 vs 1
table(d$present)
## 
##    0    1 
## 9377 9346
table(d_train$present)
## 
##    0    1 
## 7501 7476

2 Decision Trees

2.1 Partition, depth=1

# run decision stump model
mdl <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 14977 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 14977 7476 0 (0.5008346 0.4991654)  
##   2) lat>=2.707017 5411  636 0 (0.8824617 0.1175383) *
##   3) lat< 2.707017 9566 2726 1 (0.2849676 0.7150324) *
# plot tree 
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)

## 2.2 Partition, depth=default

# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 14977 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 14977 7476 0 (0.50083461 0.49916539)  
##    2) lat>=2.707017 5411  636 0 (0.88246165 0.11753835)  
##      4) WC_bio1< 28.75 4543  175 0 (0.96147920 0.03852080) *
##      5) WC_bio1>=28.75 868  407 1 (0.46889401 0.53110599)  
##       10) lat>=13.61245 367   12 0 (0.96730245 0.03269755) *
##       11) lat< 13.61245 501   52 1 (0.10379242 0.89620758) *
##    3) lat< 2.707017 9566 2726 1 (0.28496759 0.71503241)  
##      6) lon< 30.79615 3526 1455 0 (0.58735111 0.41264889)  
##       12) lat>=-15.13977 1237   79 0 (0.93613581 0.06386419) *
##       13) lat< -15.13977 2289  913 1 (0.39886413 0.60113587)  
##         26) WC_bio1< 21.45 1331  622 0 (0.53268219 0.46731781)  
##           52) ER_climaticMoistureIndex< -0.705 691  231 0 (0.66570188 0.33429812) *
##           53) ER_climaticMoistureIndex>=-0.705 640  249 1 (0.38906250 0.61093750) *
##         27) WC_bio1>=21.45 958  204 1 (0.21294363 0.78705637) *
##      7) lon>=30.79615 6040  655 1 (0.10844371 0.89155629) *
rpart.plot(mdl)

# plot complexity parameter
plotcp(mdl)

# rpart cross validation results
mdl$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.55029428      0 1.0000000 1.0279561 0.008182077
## 2 0.08239700      1 0.4497057 0.4547887 0.006857341
## 3 0.06193151      2 0.3673087 0.3673087 0.006334313
## 4 0.02655163      3 0.3053772 0.3060460 0.005889250
## 5 0.01531568      5 0.2522739 0.2546816 0.005453049
## 6 0.01000000      7 0.2216426 0.2352862 0.005270285

Question: Based on the complexity plot threshold, what size of tree is recommended?

A size of 8 is recommended based on the complexity plot threshold.

2.3 Feature interpretation

# caret cross validation results
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

ggplot(mdl_caret)

vip(mdl_caret, num_features = 40, bar = FALSE)

Question: what are the top 3 most important variables of your model?

Based on the plot above, lat, lon, and ER_climaticMoistureIndex are the most important variables of my model.

# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>% 
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE, 
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

3 Random Forests

# load additional packages
librarian::shelf(
  ranger)  # random forest modeling

3.1 Fit

# number of features
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1810624

3.2 Feature Interpretation

# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(p1, p2, nrow = 1)

Question: How might variable importance differ between rpart and RandomForest in your model outputs?

Besides lat and lon, WC_bio1 and ER_climaticMoistureIndex are the two most important variables in both the rpart model and the random forest model. The bottom three variables are also the same in both models, with ER_topoWet being the least important variable.

Lab 1d. Species Distribution Modeling - Evaluate Models

1. Setup

# global knitr chunk options
knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE)

# load packages
librarian::shelf(
  dismo, # species distribution modeling: maxent(), predict(), evaluate(), 
  dplyr, ggplot2, GGally, here, maptools, readr, 
  raster, readr, rsample, sf,
  usdm)  # uncertainty analysis for species distribution models: vifcor()
select = dplyr::select

# options
set.seed(42)
options(
  scipen = 999,
  readr.show_col_types = F)
ggplot2::theme_set(ggplot2::theme_light())

# paths
dir_data      <- here("data/sdm")
pts_geo       <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds  <- file.path(dir_data, "mdl_maxent_vif.rds")

# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)

# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)

1.1 Split observations into training and testing

# create training set with 80% of full data
pts_split  <- rsample::initial_split(
  pts, prop = 0.8, strata = "present")
pts_train  <- rsample::training(pts_split)
pts_test   <- rsample::testing(pts_split)

pts_train_p <- pts_train %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_train_a <- pts_train %>% 
  filter(present == 0) %>% 
  as_Spatial()

2 Calibrate: Model Selection

# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)

# calculate variance inflation factor per predictor, a metric of multicollinearity between variables

vif(env_stack)
##                  Variables      VIF
## 1                   WC_alt 2.337270
## 2                  WC_bio1 2.043901
## 3                  WC_bio2 2.682166
## 4 ER_climaticMoistureIndex 2.494296
## 5               ER_topoWet 1.686271
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7) 
v
## 1 variables from the 5 input variables have collinearity problem: 
##  
## WC_bio2 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( ER_climaticMoistureIndex ~ WC_bio1 ):  -0.07796855 
## max correlation ( WC_bio1 ~ WC_alt ):  -0.6385333 
## 
## ---------- VIFs of the remained variables -------- 
##                  Variables      VIF
## 1                   WC_alt 1.776481
## 2                  WC_bio1 1.902871
## 3 ER_climaticMoistureIndex 1.235929
## 4               ER_topoWet 1.564684
# reduce enviromental raster stack by 
env_stack_v <- usdm::exclude(env_stack, v)

# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)

# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
  mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
  readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)

# plot variable contributions per predictor
plot(mdl_maxv)

Question: Which variables were removed due to multicollinearity and what is the rank of most to least important remaining variables in your model?

None of my variables had colliniarity problems, so none were removed. ER_climaticMoistureIndex was the most important, followed by WC_bio1, WC_alt, ER_topoWet, and WC_bio2. The bottom three variables all just barely contributed.

# plot term plots
response(mdl_maxv)

# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')

plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

3 Evaluate: Model Performance

3.1 Area Under the Curve (AUC), Reciever Operater Characteristic (ROC) Curve and Confusion Matrix

pts_test_p <- pts_test %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_test_a <- pts_test %>% 
  filter(present == 0) %>% 
  as_Spatial()

y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)

e <- dismo::evaluate(
  p     = pts_test_p,
  a     = pts_test_a, 
  model = mdl_maxv,
  x     = env_stack)
e
## class          : ModelEvaluation 
## n presences    : 1869 
## n absences     : 1875 
## AUC            : 0.8478151 
## cor            : 0.6003439 
## max TPR+TNR at : 0.6426489
plot(e, 'ROC')

thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6426489
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)

# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)

matrix(
  c(tpr, fnr,
    fpr, tnr), 
  nrow=2, dimnames = list(
    c("present_obs", "absent_obs"),
    c("present_pred", "absent_pred")))
##             present_pred absent_pred
## present_obs   0.92455859   0.3061333
## absent_obs    0.07544141   0.6938667
# add point to ROC plot
plot(e, 'ROC')
points(fpr, tpr, pch=23, bg="blue")

plot(y_maxv > thr)